Code
# ---- Setup: libraries, project paths, helper functions, and data load ----
library(dplyr)
library(tidyr)
library(ggplot2)
library(doParallel)
library(foreach)
library(doSNOW)

# Locate the project root via the .git index so every path below is portable.
root <- rprojroot::has_file(".git/index")
datadir <- root$find_file("data")
funsdir <- root$find_file("functions")
savingdir <- root$find_file("saved_files")
savingdirOceanSlices <- root$find_file("saved_files_oceanSlices")
datapath <- root$find_file(paste0(datadir, "/", "grump.phaeocystis_asv_long.csv"))

# Source every helper script in functions/ (seq_along is safe for an
# empty directory, unlike 1:length(...)).
files_vec <- list.files(funsdir)
currentwd <- getwd()
for (i in seq_along(files_vec)) {
  source(root$find_file(paste0(funsdir, "/", files_vec[i])))
}

# Load the long-format ASV table, dropping the MOSAiC cruise and
# zero-count rows, then tidy it with the project helper.
dframe <- data.table::fread(input = datapath) %>%
  filter(Cruise != "MOSAiC", Raw.Sequence.Counts > 0)
dframe_allASVs <- tidy_grump(Dframe = dframe)
The idea is to create a ‘custom’ distance matrix to induce the groupings.
\(c_1\) is within the same species - but no unassigned
\(c_2\) is between assigned species
\(c_3\) is between species and unassigned
\(c_4\) is within unassigned and unassigned
For this document we will use only the following configuration:
\(c_1=1\) , \(c_2=1000\) , \(c_3=10\) , \(c_4=10\)
A priori, the ‘probability’ of unassigned ASVs agglomerating among themselves is the same as that of agglomerating with assigned species. A large \(c_2\) makes it harder for ASVs of different assigned species to group together.
Code
# Build the 'custom' distance matrix that induces the desired groupings:
# c1 within the same assigned species, c2 between assigned species,
# c3 between species and unassigned, c4 within unassigned.
inducedDist_ <- inducedDist(
  dFrame = dframe_allASVs$dframe,
  compMatrix = dframe_allASVs$ASV_composition,
  c1 = 1,
  c2 = 1000,
  c3 = 10,
  c4 = 10
)
If we exclude the ASVs that appear only once (or twice), does it help?
Code
# Total number of distinct ASVs (denominator for the percentage column).
nASVs <- n_distinct(dframe_allASVs$dframe$ASV_name)

# For each ASV, compute in how many samples it appears, bin those counts,
# then summarise frequency / percentage / cumulative percentage per bin.
# NOTE(review): after the summarise, sum(Freq) within ASV_name counts total
# rows per ASV, which equals the number of distinct samples only if each
# ASV occurs at most once per sample -- confirm against the data.
plot0s_summ <- dframe_allASVs$dframe %>%
  group_by(ASV_name, SampleKey) %>%
  summarise(Freq = n()) %>%
  mutate(nSamplesObserved = sum(Freq)) %>%   # grouped by ASV_name here
  arrange(nSamplesObserved) %>%
  select(ASV_name, nSamplesObserved) %>%
  distinct() %>%
  mutate(FreqNSamples = cut(
    nSamplesObserved,
    breaks = c(0, 1, 2, 3, 5, 10, 25, 50, 100, 250, 1500),
    right = TRUE, include.lowest = TRUE
  )) %>%
  group_by(FreqNSamples) %>%
  summarise(Freq = n(), Pct = n() / nASVs) %>%
  mutate(CummPct = cumsum(Pct))

# Number of ASVs per sample-count bin; geom_col() is the idiomatic
# equivalent of geom_bar(stat = 'identity').
plot0s_summ %>%
  ggplot(aes(x = FreqNSamples, y = Freq)) +
  geom_col()
Code
plot0s_summ %>% ggplot (aes (x= FreqNSamples,y= CummPct))+ geom_bar (stat= 'identity' )
Code
plot0s_summ %>% knitr:: kable () %>% kableExtra:: kable_styling ()
[0,1]
715
0.4824561
0.4824561
(1,2]
169
0.1140351
0.5964912
(2,3]
80
0.0539811
0.6504723
(3,5]
103
0.0695007
0.7199730
(5,10]
121
0.0816464
0.8016194
(10,25]
129
0.0870445
0.8886640
(25,50]
67
0.0452092
0.9338731
(50,100]
48
0.0323887
0.9662618
(100,250]
41
0.0276653
0.9939271
(250,1.5e+03]
9
0.0060729
1.0000000
Code
### Creating vectors with id of ASVs to remove:
# Per-ASV number of samples in which it was observed (same logic as the
# summary above; NOTE(review): sum(Freq) equals the distinct-sample count
# only if each ASV occurs at most once per sample -- confirm).
ASVs2Remove_df <- dframe_allASVs$dframe %>%
  group_by(ASV_name, SampleKey) %>%
  summarise(Freq = n()) %>%
  mutate(nSamplesObserved = sum(Freq)) %>%
  arrange(nSamplesObserved) %>%
  select(ASV_name, nSamplesObserved) %>%
  distinct()

# ASVs seen in fewer than 2 (resp. 3) samples, i.e. the ones to drop so
# that the remaining ASVs appear in >= 2 (resp. >= 3) samples.
vetASVs_2orMore <- ASVs2Remove_df %>%
  filter(nSamplesObserved < 2) %>%
  pull(ASV_name)
vetASVs_3orMore <- ASVs2Remove_df %>%
  filter(nSamplesObserved < 3) %>%
  pull(ASV_name)

saveRDS(vetASVs_2orMore,
        file = paste0(savingdirOceanSlices, "/", "asvs_to_remove_oneSample"))
saveRDS(vetASVs_3orMore,
        file = paste0(savingdirOceanSlices, "/", "asvs_to_remove_twoSample"))
This is the number of ASVs that are present in exactly that number of samples.
Let’s now look at how many ASVs each sample contains.
Code
## now looking ASVs inside samples
# Denominator for the percentage column.
# NOTE(review): this counts distinct SampleID while the grouping below
# uses SampleKey -- if the two columns differ in cardinality, Pct is
# wrong; confirm they are one-to-one.
nsamples <- dframe_allASVs$dframe %>%
  select(SampleID) %>%
  distinct() %>%
  nrow()

# For each sample, count its ASV rows, bin the counts, and summarise
# frequency / percentage / cumulative percentage per bin.
plot0s_summ2 <- dframe_allASVs$dframe %>%
  group_by(SampleKey) %>%
  summarise(NumbersOfAsvInEachSample = n()) %>%
  arrange(NumbersOfAsvInEachSample) %>%
  mutate(FreqAsvs = cut(
    NumbersOfAsvInEachSample,
    breaks = c(0, 1, 2, 3, 5, 10, 25, 50, 100, 250, 500),
    right = TRUE, include.lowest = TRUE
  )) %>%
  group_by(FreqAsvs) %>%
  summarise(Freq = n(), Pct = n() / nsamples) %>%
  mutate(CummPct = cumsum(Pct))

plot0s_summ2 %>%
  knitr::kable() %>%
  kableExtra::kable_styling()
[0,1]
90
0.0911854
0.0911854
(1,2]
58
0.0587639
0.1499493
(2,3]
36
0.0364742
0.1864235
(3,5]
62
0.0628166
0.2492401
(5,10]
161
0.1631206
0.4123607
(10,25]
243
0.2462006
0.6585613
(25,50]
253
0.2563323
0.9148936
(50,100]
82
0.0830800
0.9979737
(100,250]
2
0.0020263
1.0000000
Let’s now compare the distance matrix using MDS, t-SNE, and UMAP.
Code
# Dimension-reduction plots (MDS / t-SNE / UMAP) on the Aitchison
# distances of the full ASV composition matrix; show the 2-D MDS first.
dim_reduce_obj <- dim_reduce_plots(
  dfComposition = dframe_allASVs$ASV_composition
)
dim_reduce_obj$MDS2d_ASVs_Ait
Code
dim_reduce_obj$ TSNE_ASVs_Ait_2d
Code
dim_reduce_obj$ umap_ASVs_Ait_2d
Code
# Rebuild the tidy data after removing ASVs observed in only one sample,
# and report the resulting composition-matrix dimensions.
dframe2plus <- tidy_grump(Dframe = dframe, vet_ASVs2remove = vetASVs_2orMore)
dim(dframe2plus$ASV_composition)
We have 767 ASVs and 975 samples
Code
# Dimension reduction without single-sample ASVs; a t-SNE perplexity of
# 100 is used for this smaller matrix. Show the 2-D MDS first.
dim_reduce_obj2plus <- dim_reduce_plots(
  dfComposition = dframe2plus$ASV_composition,
  perplexityTsne = 100
)
dim_reduce_obj2plus$MDS2d_ASVs_Ait
Code
dim_reduce_obj2plus$ TSNE_ASVs_Ait_2d
Code
dim_reduce_obj2plus$ umap_ASVs_Ait_2d
Code
# Rebuild the tidy data after removing ASVs observed in fewer than three
# samples, and report the resulting composition-matrix dimensions.
dframe3plus <- tidy_grump(Dframe = dframe, vet_ASVs2remove = vetASVs_3orMore)
dim(dframe3plus$ASV_composition)
598 ASVs and 971 samples.
Code
# Dimension reduction for ASVs present in at least three samples
# (t-SNE perplexity 100). Show the 2-D MDS first.
dim_reduce_obj3plus <- dim_reduce_plots(
  dfComposition = dframe3plus$ASV_composition,
  perplexityTsne = 100
)
dim_reduce_obj3plus$MDS2d_ASVs_Ait
Code
dim_reduce_obj3plus$ TSNE_ASVs_Ait_2d
Code
dim_reduce_obj3plus$ umap_ASVs_Ait_2d
It seems that removing rare ASVs won’t help.
Here the idea is to aggregate some samples: we would still have compositions of ASVs, but over aggregated groups instead of individual samples, giving a less sparse matrix with fewer dimensions. The most natural aggregations could be:
Compositions of ASVs, under longhurst provinces
Compositions of ASVs, under longhurst provinces combined with different depths
Cruises?
Chunks of latitudes, and chunks of depths.
So for each aggregation we should look at the MDS, t-SNE, and UMAP of the Aitchison distances (i.e., the distance matrix computed on the CLR-transformed compositions).
So first, let’s do a bit of EDA to see what we can expect.
Code
# Distribution of sampling depths over the distinct samples.
dframe_allASVs$dframe %>%
  select(SampleKey, Depth, Longhurst_Short) %>%
  distinct() %>%
  ggplot(aes(Depth)) +
  geom_histogram()
Code
# Same depth distribution, shown as a boxplot.
dframe_allASVs$dframe %>%
  select(SampleKey, Depth, Longhurst_Short) %>%
  distinct() %>%
  ggplot(aes(Depth)) +
  geom_boxplot()
Let’s slice the depths of the ocean into 10% quantiles.
Code
# Decile breakpoints of the sampling depth, plus zero-padded group labels
# DGR01..DGR10 ("depth group rank"). sprintf("%02d", ...) replaces the
# original ifelse/paste0 padding hack, and seq_len is safe for any length.
distinct_depth <- dframe_allASVs$dframe %>%
  select(SampleKey, Depth) %>%
  distinct()
qtls_0.1 <- quantile(distinct_depth$Depth, seq(0, 1, 0.1))
qtlsNames <- sprintf("DGR%02d", seq_len(length(qtls_0.1) - 1))
qtls_0.1
0% 10% 20% 30% 40% 50% 60%
0.00000 1.18800 15.00000 28.80741 45.00000 61.50000 81.29718
70% 80% 90% 100%
105.47600 150.64307 335.50200 2568.80000
Here are the depth quantiles.
Code
# Bin each sample's depth into the decile groups and tabulate the counts
# per group (TRUE/FALSE spelled out instead of T/F).
distinct_depth <- distinct_depth %>%
  mutate(CatDepth = cut(
    Depth,
    breaks = qtls_0.1,
    include.lowest = TRUE, right = TRUE,
    labels = qtlsNames
  ))
distinct_depth %>%
  group_by(CatDepth) %>%
  summarise(Freq = n())
# A tibble: 10 × 2
CatDepth Freq
<fct> <int>
1 DGR01 99
2 DGR02 115
3 DGR03 82
4 DGR04 112
5 DGR05 86
6 DGR06 98
7 DGR07 99
8 DGR08 98
9 DGR09 99
10 DGR10 99
Code
# Number of samples per depth group, faceted by Longhurst province
# (TRUE spelled out instead of T).
dframe_allASVs$dframe %>%
  select(SampleKey, Depth, Longhurst_Short) %>%
  distinct() %>%
  mutate(CatDepth = cut(
    Depth,
    breaks = qtls_0.1,
    include.lowest = TRUE, right = TRUE,
    labels = qtlsNames
  )) %>%
  ggplot(aes(CatDepth)) +
  geom_bar() +
  facet_wrap(~ Longhurst_Short, scales = 'free_y')
This is the distribution of the number of samples for each DGR (depth group rank) within each Longhurst province.
So our first attempt will be to create compositions of ASVs over this set of combinations (DGR × LH).
Code
# Load the precomputed Longhurst-province x depth-group composition
# matrix and run the dimension-reduction plots on it; 2-D MDS first.
LH_Depth_Composition_df <- readRDS(
  file = paste0(savingdir, "/", "LH_Depth_Composition_df")
)
LH_Depth_dimRed <- dim_reduce_plots(
  dfComposition = LH_Depth_Composition_df,
  perplexityTsne = 100,
  neighUmap = 250
)
LH_Depth_dimRed$MDS2d_ASVs_Ait
Code
LH_Depth_dimRed$ TSNE_ASVs_Ait_2d
Code
LH_Depth_dimRed$ umap_ASVs_Ait_2d
Code
# Load the precomputed latitude-chunk x depth-group composition matrix
# and run the dimension-reduction plots on it; 2-D MDS first.
Lat_Depth_Composition_df <- readRDS(
  file = paste0(savingdir, "/", "Lat_Depth_Composition_df")
)
Lat_Depth_dimRed <- dim_reduce_plots(
  dfComposition = Lat_Depth_Composition_df,
  perplexityTsne = 100,
  neighUmap = 250
)
Lat_Depth_dimRed$MDS2d_ASVs_Ait
Code
Lat_Depth_dimRed$ TSNE_ASVs_Ait_2d
Code
Lat_Depth_dimRed$ umap_ASVs_Ait_2d
# Creating Clusters
Let’s now create and evaluate the clusters using these two ocean slicings that we have.